import math
import numpy as np
import pandas as pd
## Data Visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
## Machine Learning
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
import seaborn as sns
## Ignore warnings
import warnings
warnings.filterwarnings("ignore")
## Load the competition data: training feature matrix, training labels, and
## the unlabeled test feature matrix (DrivenData/DAT102x heart-disease task).
train = pd.read_csv("dataset/DAT102x_Predicting_Heart_Disease_Mortality_-_Training_values.csv")
label = pd.read_csv("dataset/DAT102x_Predicting_Heart_Disease_Mortality_-_Training_labels.csv")
test = pd.read_csv("dataset/DAT102x_Predicting_Heart_Disease_Mortality_-_Test_values.csv")
## Keep the test-set row identifiers aside (needed to build a submission after
## row_id is dropped from the feature frames below).
test_Id = test["row_id"]
## Quick look at the first training rows (notebook display).
train.head()
## Checking whether the data sets contain duplicate row_ids
## If the frame shape equals the number of unique row_ids, there are no
## duplicate rows in either data set.
for frame in (train, test):
    print(frame.shape)
    print(frame.row_id.unique().shape)
## Stack train and test so missing values are imputed consistently on both.
full = pd.concat([train, test], axis=0)
print(len(full))
## Missing-value count per column, largest first; fully-populated columns
## are filtered out of the report.
na_counts = full.isnull().sum()
full_NA = na_counts[na_counts > 0].sort_values(ascending=False)
print(full_NA)
## Impute every gappy numeric column with its median computed over the
## combined train+test frame.  The original repeated the identical fillna
## statement 21 times; a single loop over the column list removes the
## duplication.  Plain assignment is used instead of the original chained
## `full[col].fillna(..., inplace=True)`, which is chained assignment and is
## deprecated / unreliable in recent pandas.
median_fill_cols = [
    "health__homicides_per_100k",
    "health__pct_excessive_drinking",
    "health__pct_adult_smoking",
    "health__motor_vehicle_crash_deaths_per_100k",
    "health__pop_per_dentist",
    "health__pop_per_primary_care_physician",
    "health__pct_low_birthweight",
    "health__air_pollution_particulate_matter",
    "demo__pct_non_hispanic_african_american",
    "econ__pct_uninsured_children",
    "demo__pct_female",
    "demo__pct_below_18_years_of_age",
    "demo__pct_aged_65_years_and_older",
    "demo__pct_hispanic",
    "health__pct_adult_obesity",
    "demo__pct_non_hispanic_white",
    "demo__pct_american_indian_or_alaskan_native",
    "demo__pct_asian",
    "health__pct_diabetes",
    "health__pct_physical_inacticity",
    "econ__pct_uninsured_adults",
]
for col in median_fill_cols:
    full[col] = full[col].fillna(full[col].median())
## Split the imputed combined frame back into train and test partitions.
## `.iloc[...].copy()` yields independent frames — the original plain slices
## (`full[:n]`) made the later in-place `drop` operate on a view of `full`,
## triggering pandas SettingWithCopy warnings.
n_train = train.shape[0]
train = full.iloc[:n_train].copy()
test = full.iloc[n_train:].copy()
## Attach the target to the training rows via the shared row_id key.
train = pd.merge(train, label, on="row_id")
## Keep only the target series for modelling.
label = label["heart_disease_mortality_per_100k"]
## dropping the row_id for statistical analysis — it is an identifier, not a feature
train.drop(["row_id"], axis=1, inplace=True)
test.drop(["row_id"], axis=1, inplace=True)
## Summary statistics of the training features (notebook display).
train.describe()
## Distribution of the target: heart-disease mortality per 100k.
sns.distplot(train["heart_disease_mortality_per_100k"])
plt.show()
## Correlation of every numeric feature with the target, strongest first,
## to see which features have the tightest relationship with mortality.
corr = train.corr().sort_values(
    ["heart_disease_mortality_per_100k"], ascending=False
)
print(corr["heart_disease_mortality_per_100k"])
# Now creating the density plot distribution to every feature to see thier distribution
def plot_density_hist(train, cols, bins = 10, hist = True):
    """Draw a seaborn density plot (optionally with histogram) per column.

    train : DataFrame holding the columns.
    cols  : iterable of column names to plot, one figure each.
    bins  : currently unused by the body; kept for interface compatibility.
    hist  : forwarded to seaborn to toggle the histogram overlay.
    """
    for col in cols:
        fig = plt.figure(figsize = (6,6))
        sns.set_style("whitegrid")
        # NaNs are dropped so the KDE does not choke on missing values.
        sns.distplot(train[col].dropna(), hist = hist)
        plt.title("Histogram of " + col)
        plt.xlabel(col)
        plt.ylabel("")
        # BUG FIX: original had `plt.show` without parentheses — a bare
        # attribute reference that never displayed the figure.
        plt.show()
## Numeric feature columns to inspect, one per line for easy diffing.
## (Column names come from the data set, including its spelling of
## "inacticity".)
num_cols = [
    "health__pct_physical_inacticity",
    "health__pct_diabetes",
    "health__pct_adult_obesity",
    "demo__pct_adults_less_than_a_high_school_diploma",
    "health__pct_adult_smoking",
    "health__pct_low_birthweight",
    "health__motor_vehicle_crash_deaths_per_100k",
    "demo__death_rate_per_1k",
    "health__homicides_per_100k",
    "demo__pct_adults_with_high_school_diploma",
    "demo__pct_non_hispanic_african_american",
    "econ__pct_unemployment",
    "econ__pct_uninsured_adults",
    "health__pop_per_dentist",
    "health__pop_per_primary_care_physician",
    "health__air_pollution_particulate_matter",
    "demo__birth_rate_per_1k",
    "demo__pct_below_18_years_of_age",
    "demo__pct_female",
    "demo__pct_american_indian_or_alaskan_native",
    "econ__pct_uninsured_children",
    "demo__pct_aged_65_years_and_older",
    "demo__pct_hispanic",
    "demo__pct_non_hispanic_white",
    "demo__pct_asian",
    "demo__pct_adults_with_some_college",
    "health__pct_excessive_drinking",
    "econ__pct_civilian_labor",
    "demo__pct_adults_bachelors_or_higher",
]
## One density/histogram figure per numeric feature.
plot_density_hist(train, num_cols)
def plot_scatter_t(train, cols, col_y = "heart_disease_mortality_per_100k", alpha = 1.0):
    """Scatter-plot each column in `cols` against the target column `col_y`."""
    for feature in cols:
        figure = plt.figure(figsize=(7,6))
        axis = figure.gca()
        # alpha < 1 lets overlapping points show density.
        train.plot.scatter(x=feature, y=col_y, ax=axis, alpha=alpha)
        axis.set_title('Scatter plot of ' + col_y + ' vs. ' + feature)
        axis.set_xlabel(feature)
        axis.set_ylabel(col_y)
        plt.show()
## One scatter figure per numeric feature vs. the mortality target.
plot_scatter_t(train, num_cols, alpha = 0.2)
# NOTE(review): this block is an exact, byte-for-byte duplicate of the
# plot_scatter_t definition and call immediately above (likely a copied
# notebook cell).  It redefines the same function and redraws the same
# figures; it can be deleted with no change in results.
def plot_scatter_t(train, cols, col_y = "heart_disease_mortality_per_100k", alpha = 1.0):
for col in cols:
fig = plt.figure(figsize=(7,6))
ax = fig.gca()
train.plot.scatter(x = col, y = col_y, ax = ax, alpha = alpha)
ax.set_title('Scatter plot of ' + col_y + ' vs. ' + col)
ax.set_xlabel(col)
ax.set_ylabel(col_y)
plt.show()
plot_scatter_t(train, num_cols, alpha = 0.2)
## Calculating the categorical value counts for each feature
## Categorical feature columns.
cat_cols = ["area__rucc", "area__urban_influence", "econ__economic_typology", "yr"]
## Frequency table of each categorical feature (bare expressions — in a
## notebook only the last one is rendered).
for cat in cat_cols:
    train[cat].value_counts()
def plot_bars(train, cols):
    """Bar-chart the category frequencies of each column in `cols`."""
    for column in cols:
        figure = plt.figure(figsize=(6,6))
        axis = figure.gca()
        frequencies = train[column].value_counts()
        frequencies.plot.bar(ax=axis, color='blue')
        axis.set_title('Number of people by ' + column)
        axis.set_xlabel(column)
        axis.set_ylabel('Number of people')
        plt.show()
## One bar chart per categorical feature.
plot_bars(train, cat_cols)
def plot_box(train, cols, col_y = "heart_disease_mortality_per_100k"):
    """Box-plot the target `col_y` grouped by each categorical column.

    COMPAT FIX: the original called `sns.boxplot(col, col_y, data=train)` with
    positional x/y arguments, which seaborn deprecated in 0.12 and later
    removed; keyword arguments work on both old and current seaborn.
    """
    for col in cols:
        sns.set_style("whitegrid")
        sns.boxplot(x = col, y = col_y, data = train)
        plt.xlabel(col)
        plt.ylabel(col_y)
        # category labels are long; rotate them so they stay readable
        plt.xticks(rotation=90)
        plt.show()
plot_box(train, cat_cols)
## Separating the label from the training data set and encoding all categorical variables as dummy variables
## Drop the target column (kept separately in `label`) before encoding.
train.drop(["heart_disease_mortality_per_100k"], axis=1, inplace=True)
## One-hot encode the categorical features.
train = pd.get_dummies(train)
train.shape
test = pd.get_dummies(test)
## ROBUSTNESS FIX: encoding train and test separately yields mismatched dummy
## columns whenever a category level occurs in only one frame, which would
## break the StandardScaler/model steps below.  Re-align test to the training
## columns, filling any absent dummies with 0.
test = test.reindex(columns=train.columns, fill_value=0)
test.shape
## 80/20 train/validation split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, shuffle=True, train_size=0.8, random_state=0
)
## Standardize features: statistics are fitted on the training split only and
## applied to both splits.
scale = StandardScaler().fit(x_train)
x_train = scale.transform(x_train)
x_test = scale.transform(x_test)
## Support-vector classifier with an RBF kernel.
## BUG FIX: the original called .fit(x_test, y_test) — it trained on the
## hold-out split, so the reported "accuracy" was measured on the very data
## the model was fitted to.  Fit on the training split instead.
## (Also fixes the `classififcation1` typo in the assignment.)
## NOTE(review): the target is a continuous mortality rate, so classification
## accuracy is a questionable metric here — a regressor scored with RMSE would
## suit this task better; left unchanged apart from the fit fix.
classification1 = SVC(kernel="rbf", random_state=10)
classification1 = classification1.fit(x_train, y_train)
pred1 = classification1.predict(x_test)
print("accuracy score of SVM is")
score = metrics.accuracy_score(y_test, pred1)
print(score)
## Logistic-regression baseline.
## BUG FIX: the original fitted on (x_test, y_test) — training on the
## hold-out data; fit on the training split instead.
## max_iter raised from 10 to 1000: the solver cannot converge in 10
## iterations (the convergence warning was hidden by the global warnings
## filter at the top of the file).
classification2 = LogisticRegression(random_state=10, max_iter=1000, C=2.0)
classification2 = classification2.fit(x_train, y_train)
pred2 = classification2.predict(x_test)
print(" accuracy score of Logistic Regression is")
score1 = metrics.accuracy_score(y_test, pred2)
print(score1)